###########################################################################################
### Estimating and Evaluating Heterogeneous Treatment Effects: A Causal Forest Approach ###
###                 By Li Zheng and Weiwen Yin, at Research & Politics                  ###
###########################################################################################





install.packages("interplot")
install.packages("ggpubr")
install.packages("foreign")
install.packages("interflex")
install.packages("stringi")
install.packages("ggplot2")
install.packages("dplyr")    
install.packages("fBasics")     
install.packages("grf")         
install.packages("devtools")    
install.packages("xtable")
install.packages("aod")
install.packages("tidyr")
install.packages("texreg")
install.packages("sandwich")
install.packages("lmtest")
install.packages("ggpubr")
install.packages("dummies")


##### Replication Case I: Oreopoulos (2011, AEJ)

rm(list = ls())
# update.packages(ask=F)
library(foreign)
library(interflex)
library(stringi)
library(ggplot2)
library(dplyr)    
library(fBasics)     
library(grf)         
library(devtools)    
library(xtable)
library(aod)
library(tidyr)
library(texreg)
library(sandwich)
library(lmtest)
library(ggpubr)

## set root directory of the replication file, remember to create Data and Graphs folders below
path <- "" 

## set graph folder
graphpath<-paste(path,"Graphs/",sep="")



#### 0. Preparation

name<-"Oreopoulos2011"
d<-read.dta(paste0(path,"Data/Oreopoulos2011skilled/Oreopoulos2011skilled.dta"),convert.factor=FALSE) # one of the four datasets
dim(d)

Y="callback" 
D="canadian_name" #treatment
Z<-c("female", "ba_quality", "extracurricular_skills", "language_skills",
     "ma", "same_exp", "exp_highquality", "reference", "accreditation", "legal")  #Table 5�? Panel A, second row 

df = d[,c(Y,D,Z)]
df = na.omit(df)
covariate_names = Z



#### 1. Step 1: Fit Causal Forest

### 1.1. Fit the forest

cf <- causal_forest(
  X = as.matrix(df[,covariate_names]),
  Y = df[,Y],
  W = df[,D],
  seed = 2019)


### 1.2. Predict point estimates and standard errors (training set, out-of-bag)

pred <- predict(cf, estimate.variance=TRUE)
tauhat <- pred$predictions
tauhat_se <- sqrt(pred$variance.estimates)


### 1.3. Average Treatment Effects

## AIPW for ATE in grf # sample is treated 
ate = average_treatment_effect(cf,target.sample = "treated",method = "AIPW")
ate
pvalue.ate = 2*pnorm(-abs(ate[1]/ate[2]))
pvalue.ate
xtable(data.frame(ate),digits = 3)


### 1.4. Number of significant individual HTE
result <- data.frame(tauhat, tauhat_se)
result$hte_upper=result$tauhat+result$tauhat_se*1.96
result$hte_lower=result$tauhat-result$tauhat_se*1.96
result$significant=1
result$significant[(result$hte_upper>0) & (result$hte_lower<0)]=0
sig.prop = nrow(result[result$significant == 1,])/nrow(result)
sig.prop

## summary stat of the siginificant and insigificant group
# sum stat
s1 = summary(result[with(result,significant==1),"tauhat"])
s0 = summary(result[with(result,significant==0),"tauhat"])
xtable(rbind(s1,s0),digits = 3)
# number of positive and negative hte
nrow(result[with(result,significant==1),])
nrow(result[with(result,significant==1 & tauhat>=0),])
nrow(result[with(result,significant==1 & tauhat<0),])
nrow(result[with(result,significant==0),])
nrow(result[with(result,significant==0 & tauhat>=0),])
nrow(result[with(result,significant==0 & tauhat<0),])


### 1.5. Histogram
pdf(paste(graphpath,name,"_hist.pdf",sep=""))
ggplot(data.frame(tauhat),aes(x=tauhat)) + 
  geom_histogram(color="grey",bins=30) + 
  geom_vline(aes(xintercept = ate[1],col="ATE")) + 
  scale_color_manual(name="legend",values = c(ATE="red")) +
  xlab("Treatment Effects") + 
  ylab("Count")
dev.off()



#### 2. Step 2: Assessing Treatment Effect Heterogeneity

###  BLP Test
tc <- test_calibration(cf)
tc
texreg(tc,digits = 3)



#### 3. Step 3: Finding Important Moderators

## compute variable importance for all moderating variables in the causal forest
var_imp <- c(variable_importance(cf))
names(var_imp) <- covariate_names
sorted_var_imp <- sort(var_imp, decreasing=TRUE)
sorted_var_imp
xtable(data.frame(sorted_var_imp),digits=3)



#### 4. Step 4: Evaluating Treatment Effect Heterogeneity

# Linear regression without interaction terms
ols = lm(callback~.,data = df)
summary(ols)
ols.hbs = coeftest(ols, vcov = vcovHC(ols, "HC1"))
ols.hbs

# Linear regression with interaction terms
interaction.lm = lm(callback~.+canadian_name*female+canadian_name*ba_quality,data=df[,1:12])
summary(interaction.lm)
interaction.lm.hbs = coeftest(interaction.lm, 
                              vcov = vcovHC(interaction.lm,"HC1"))
interaction.lm.hbs

texreg(list(ols.hbs,interaction.lm.hbs),digits = 3)




##### Replication Case II: Huddy et al. (2015, APSR)


rm(list = ls())
# update.packages(ask=F)
library(foreign)
library(interflex)
library(stringi)
library(ggplot2)
library(geoR)
library(plotly)
library(tidyr)
library(dplyr)       # Data manipulation (0.8.0.1)
library(fBasics)     # Summary statistics (3042.89)
library(grf)         # Generalized random forests (0.10.2)
library(devtools)    # Install packages from github (2.0.1)
library(Hmisc)
library(xtable)
library(ggpubr)

### set root directory of the replication file, remember to create Data and Graphs folders below
path <- ""

## set graph folder
graphpath<-paste(path,"Graphs/",sep="")


### Preparation

d<-read.dta(paste0(path,"Data/Huddy_APSR_2015/rep_huddy_2015a.dta"),convert.factor=FALSE)
name<-"huddy_2015a"
Y="totangry" 
D="threat" 
X="pidentity"
Z<-c("issuestr2","pidstr2","knowledge","educ","male","age10")


### Fit the forest

df = d[,c(Y,X,D,Z)]
df = na.omit(df)
covariate_names = c(X,Z)

cf <- causal_forest(
  X = as.matrix(df[,covariate_names]),
  Y = df[,Y],
  W = df[,D],
  num.trees=5000,
  seed = 2019)


### Predict point estimates and standard errors (training set, out-of-bag)

pred <- predict(cf, estimate.variance=TRUE)
tauhat <- pred$predictions
tauhat_se <- sqrt(pred$variance.estimates)

result <- data.frame(tauhat, tauhat_se)
result$hte_upper=result$tauhat+result$tauhat_se*1.96
result$hte_lower=result$tauhat-result$tauhat_se*1.96
result$significant=1
result$significant[(result$hte_upper>0) & (result$hte_lower<0)]=0
sig.prop = nrow(result[result$significant == "1",])/nrow(result)


### Look at the Histogram
pdf(paste(graphpath,name,"_cf_hist.pdf",sep=""))
hist(tauhat, main="HTE with Causal Forests",col="grey",
     xlab="Treatment Effects",ylim=c(0,200))
dev.off()


#### Step 4': Evaluating Moderating Effect along Multiple Variables

mod_var = "pidentity"
mod_var2 = "issuestr2"

mv1list = c(mod_var,mod_var2)
mv2list = setdiff(covariate_names,mv1list)
plist = list()
k=0

for (i in 1:length(mv1list)) {
  for (j in 1:length(mv2list)) {
    k = k+1
    ## the moderating variables
    mv1 = mv1list[i]
    mv2 = mv2list[j]
    ## make grids on the moderating variables
    x = seq(min(df[,mv1]),max(df[,mv1]),length=20)
    df_newx <- setNames(data.frame(x), mv1)
    y = seq(min(df[,mv2]),max(df[,mv2]),length=20)
    df_newy <- setNames(data.frame(y), mv2)
    ## other variables at the median level
    vars_of_interest = c(mv1,mv2)
    other_covariates <- covariate_names[which(!covariate_names %in% vars_of_interest)]
    df_median <- data.frame(lapply(df[,other_covariates], median))
    df_new = crossing(df_median,df_newx,df_newy)
    ## prediction
    tauhat_2mv = predict(cf,newdata=as.matrix(df_new[,covariate_names]),
                         estimate.variance=TRUE)$predictions
    df_new$cate = tauhat_2mv
    ## plot the HTE w.r.t. the two mv's, using the heatmap
    plist[[k]] = ggplot(data=df_new, aes_string(mv1, mv2, fill="cate")) + geom_tile() + 
      scale_fill_gradient(low="blue", high="red")
  }
}

png(paste(graphpath,name,"_cf_2mv.png",sep=""))
ggarrange(plist[[3]],plist[[5]],plist[[2]],plist[[8]],plist[[10]],plist[[7]],ncol=3,nrow=2)
dev.off()
